Working Directory: /Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/ACMG_Penetrance
setwd("/Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance")
#save.image("/Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/Environ_2017-01-09.RData")
load("/Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/Environ_2017-01-09.RData")
\newpage
http://www.ncbi.nlm.nih.gov/clinvar/docs/acmg/
cat(sprintf("Table from ACMG SF v2.0 Paper %s x %s (selected rows):", nrow(ACMG.table), ncol(ACMG.table)))
## Table from ACMG SF v2.0 Paper 60 x 8 (selected rows):
row.names(ACMG.table) <- paste0("N",1:nrow(ACMG.table))
ACMG.table[1:5,] %>% pander
| Phenotype | MIM_disorder | PMID_Gene_Reviews_entry | |
|---|---|---|---|
| N1 | Hereditary breast and ovarian cancer | 604370|612555 | 20301425 |
| N2 | Hereditary breast and ovarian cancer | 604370|612555 | 20301425 |
| N3 | Li-Fraumeni syndrome | 151623 | 20301488 |
| N4 | Peutz-Jeghers syndrome | 175200 | 20301443 |
| N5 | Lynch syndrome | 120435 | 20301390 |
Table continues below
| Typical_age_of_onset | Gene | MIM_gene | Inheritance | Variants_to_report | |
|---|---|---|---|---|---|
| N1 | Adult | BRCA1 | 113705 | AD | KP&EP |
| N2 | Adult | BRCA2 | 600185 | AD | KP&EP |
| N3 | Child/Adult | TP53 | 191170 | AD | KP&EP |
| N4 | Child/Adult | STK11 | 602216 | AD | KP&EP |
| N5 | Adult | MLH1 | 120436 | AD | KP&EP |
cat("ACMG-59 Genes:")
## ACMG-59 Genes:
print(ACMG.panel, quote = F)
## [1] BRCA1 BRCA2 TP53 STK11 MLH1 MSH2 MSH6 PMS2
## [9] APC MUTYH BMPR1A SMAD4 VHL MEN1 RET PTEN
## [17] RB1 SDHD SDHAF2 SDHC SDHB TSC1 TSC2 WT1
## [25] NF2 COL3A1 FBN1 TGFBR1 TGFBR2 SMAD3 ACTA2 MYH11
## [33] MYBPC3 MYH7 TNNT2 TNNI3 TPM1 MYL3 ACTC1 PRKAG2
## [41] GLA MYL2 LMNA RYR2 PKP2 DSP DSC2 TMEM43
## [49] DSG2 KCNQ1 KCNH2 SCN5A LDLR APOB PCSK9 ATP7B
## [57] OTC RYR1 CACNA1S
ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz
ClinVar is the central repository for variant interpretations. Relevant information from the VCF includes:
(a) CLNSIG = “Variant Clinical Significance, 0 - Uncertain, 1 - Not provided, 2 - Benign, 3 - Likely benign,
4 - Likely pathogenic, 5 - Pathogenic, 6 - Drug response, 7 - Histocompatibility, 255 - Other”
(b) CLNDBN = “Variant disease name”
(c) CLNDSDBID = “Variant disease database ID”
(d) INTERP = Pathogenicity (likely pathogenic or pathogenic; CLNSIG = 4 or 5)
#Number to interpretation
clnsig_map <- c(0:7,255,-1) %>% setNames(c("Uncertain",
"Not_Provided","Benign","Likely_Benign","Likely_Pathogenic",
"Pathogenic","Drug_Response","Histocompatibility","Other","NA"))
#"no_assertion - No assertion provided,
#no_criteria - No assertion criteria provided,
#single - Criteria #provided single submitter,
#mult - Criteria provided multiple submitters no conflicts,
#conf - Criteria #provided conflicting interpretations,
#exp - Reviewed by expert panel,
#guideline - Practice guideline"
get_clinvar <- function(clinvar_file) {
file.by.line <- readLines(clinvar_file)
#file_date <- as.Date(strsplit(file.by.line[2],"=")[[1]][2], "%Y%m%d")
#system(sprintf("mv %s ClinVar_Reports/clinvar_%s.vcf", clinvar_file, file_date))
clean.lines <- file.by.line[!grepl("##.*", file.by.line)] #Remove ## comments
clean.lines[1] <- sub('.', '', clean.lines[1]) #Remove # from header
input <- read.table(text = paste(clean.lines, collapse = "\n"), header = T, stringsAsFactors = F,
comment.char = "", quote = "", sep = "\t")
input <- input[nchar(input$REF)==1,] #deletions
alt_num <- sapply(strsplit(input$ALT,","),length) #number of alts
acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
input <- input[nchar(input$ALT)==acceptable_nchar,] #insertions
input$ALT <- strsplit(input$ALT,",")
split_all <- strsplit(input$INFO,";")
has.clndsdbid <- any(grepl("CLNDSDBID", split_all))
has.clnrevstat <- any(grepl("CLNREVSTAT", split_all))
split_info <- function(name) {
sapply(split_all, function(entry) {
entry[grep(name,entry)]
}) %>% strsplit("=") %>% sapply(function(x) x[2]) %>% strsplit(",")
}
input$CLNALLE <- split_info("CLNALLE=") %>% sapply(as.integer)
input$CLNSIG <- split_info("CLNSIG=")
input$CLNDBN <- split_info("CLNDBN=")
if (has.clnrevstat)
input$CLNREVSTAT <- split_info("CLNREVSTAT=")
if (has.clndsdbid)
input$CLNDSDBID <- split_info("CLNDSDBID=")
#CLNALLE has 0,-1,3,4 --> CLNSIG has 1,2,3,4 --> ALT has 1.
taking <- sapply(input$CLNALLE, function(x) x[x>0] ) #Actual elements > 0. Keep these in CLNSIG and ALT
taking_loc <- sapply(input$CLNALLE, function(x) which(x>0) )#Tracks locations for keeping in CLNALLE
keep <- sapply(taking, length)>0 #reduce everything to get rid of 0 and -1
# Reduce, reduce, reduce.
taking <- taking[keep]
taking_loc <- taking_loc[keep]
input <- input[keep,]
#Make this more readable
input$ALT <- sapply(1:nrow(input), function(row) {
input$ALT[[row]][taking[[row]]]
})
col_subset <- function(name) {
sapply(1:nrow(input), function(row) {
unlist(input[row,name])[taking_loc[[row]]]
})
}
input$CLNSIG <- col_subset("CLNSIG")
input$CLNALLE <- col_subset("CLNALLE")
input$CLNDBN <- col_subset("CLNDBN")
if (has.clnrevstat)
input$CLNREVSTAT <- col_subset("CLNREVSTAT")
if (has.clndsdbid)
input$CLNDSDBID <- col_subset("CLNDSDBID")
filter_condition <- input[,unlist(lapply(input, typeof))=="list"] %>%
apply(1,function(row) !any(is.na(row)))
input <- input %>% filter(filter_condition) %>%
unnest %>% unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>%
select(VAR_ID, CHROM, POS, ID, REF, ALT, CLNALLE, CLNSIG, everything()) %>%
mutate(CLNSIG = strsplit(CLNSIG,"|",fixed = T)) %>%
mutate(CLNDBN = strsplit(CLNDBN,"|",fixed = T)) %>%
mutate(POS = as.integer(POS))
if (has.clnrevstat)
input <- input %>% mutate(CLNREVSTAT = strsplit(CLNREVSTAT,"|",fixed = T))
if (has.clndsdbid)
input <- input %>% mutate(CLNDSDBID = strsplit(CLNDSDBID,"|",fixed = T))
input$CLNSIG <- sapply(input$CLNSIG, function(x) as.integer(x))
input$INTERP <- sapply(input$CLNSIG, function(x) any(x %in% c(4,5)) )
input
}
clinvar <- get_clinvar(clinvar_file)
#clinvar[duplicated(clinvar$VAR_ID),1:8]
clinvar <- clinvar[!duplicated(clinvar$VAR_ID),]
cat(sprintf("Processed ClinVar data frame %s x %s (selected rows/columns):", nrow(clinvar), ncol(clinvar)))
## Processed ClinVar data frame 204730 x 15 (selected rows/columns):
clinvar[5:8,] %>% select(-CLNALLE, -INFO, -QUAL, -FILTER) %>% remove_rownames %>% pander
| VAR_ID | CHROM | POS | ID | REF | ALT | CLNSIG |
|---|---|---|---|---|---|---|
| 1_957568_A_G | 1 | 957568 | rs115704555 | A | G | 2 |
| 1_957605_G_A | 1 | 957605 | rs756623659 | G | A | 5 |
| 1_957640_C_T | 1 | 957640 | rs6657048 | C | T | 255 |
| 1_957693_A_T | 1 | 957693 | rs879253787 | A | T | 5 |
Table continues below
| CLNDBN | CLNREVSTAT | CLNDSDBID | INTERP |
|---|---|---|---|
| not_specified | single | CN169374 | FALSE |
| Congenital_myasthenic_syndrome | no_criteria | C0751882:ORPHA590 | TRUE |
| not_specified | conf | CN169374 | FALSE |
| Congenital_myasthenic_syndrome | no_criteria | C0751882:ORPHA590 | TRUE |
unlink(clinvar_file)
ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.[chrom].phase3_[version].20130502.genotypes.vcf.gz
Downloaded 1000 Genomes VCFs are saved in: /Users/jamesdiao/Documents/Kohane_Lab/2017-ACMG-penetrance/1000G/
download_1000g <- function(gene, download) {
#for tracking: #gene %>% paste(which(ACMG.panel==gene)) %>% paste(length(ACMG.panel), sep = "/") %>% print
success <- FALSE
refGene <- sprintf("select * from refGene where name2 = \"%s\" limit 20", gene) %>% query
UCSC <- select(refGene, name, chrom, start = txStart, end = txEnd)
if (nrow(UCSC) == 0) { #No hit on refGene
return(rep("NOT_FOUND",5) %>% setNames(c("name","chrom","start","end","downloaded")))
} else {
if (nrow(UCSC) > 1) #Multiple hits: take the widest range
UCSC <- UCSC[which.max(UCSC$end-UCSC$start),]
if (download) {
# gets [n] from chr[n]
chrom.num <- strsplit(UCSC$chrom, split = "chr")[[1]][2]
# different version for chromosomes X and Y
version <- switch(chrom.num, "X" = "shapeit2_mvncall_integrated_v1b",
"Y" = "integrated_v2a", "shapeit2_mvncall_integrated_v5a")
command <- paste("tabix -h ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.%s.",
"phase3_%s.20130502.genotypes.vcf.gz %s:%s-%s > %s_genotypes.vcf", sep = "")
sprintf(command, UCSC$chrom, version, chrom.num, UCSC$start, UCSC$end, gene) %>% system
Sys.sleep(2)
# Checks whether the file exists and has non-zero size
exists <- grepl(paste(gene,"_genotypes.vcf",sep =""), system("ls", intern = T)) %>% sum > 0
file.size <- strsplit(paste("stat ","_genotypes.vcf", sep = gene) %>%
system(intern = T), " ")[[1]][8]
success <- exists & file.size > 0
}
}
return(c(UCSC,"downloaded" = success))
}
if (!skip_download & !skip_processing) {
system("mkdir 1000G")
setwd(paste(getwd(), "1000G", sep = "/"))
for (con in dbListConnections(MySQL())) dbDisconnect(con)
con <- dbConnect(MySQL(), user = 'genome',
dbname = 'hg19', host = 'genome-mysql.cse.ucsc.edu',
unix.sock = "/Applications/MAMP/tmp/mysql/mysql.sock")
query <- function (input) { suppressWarnings(dbGetQuery(con, input)) }
download_output <- sapply(ACMG.panel, function(gene) download_1000g(gene, download = T)) %>% t
print(download_output, quote = F)
download_output <- download_output %>%
apply(2, unlist) %>%
as.data.frame(stringsAsFactors = F) %>%
mutate("gene" = rownames(download_output)) %>%
select(gene, everything()) %>%
filter(downloaded != "NOT_FOUND")
download_output <- download_output %>%
mutate(chrom = sapply(strsplit(download_output$chrom,"chr"), function(x) x[2]),
start = as.integer(start), end = as.integer(end),
downloaded = as.logical(downloaded))
write.table(download_output, file = "download_output.txt",
row.names = F, col.names = T, quote = F, sep = "\t")
system("rm *.genotypes.vcf.gz.tbi")
setwd("../")
} else {
if (skip_download | skip_processing) {
download_output <- read.table("Supplementary_Files/download_output.txt", header = T, stringsAsFactors = F)
} else {
download_output <- read.table("1000G/download_output.txt", header = T, stringsAsFactors = F)
}
}
cat(sprintf("Download report: region and successes: %s x %s (selected rows):", nrow(download_output), ncol(download_output)))
## Download report: region and successes: 59 x 6 (selected rows):
download_output[1:5,] %>% format(scientific = F) %>% pander
| gene | name | chrom | start | end | downloaded |
|---|---|---|---|---|---|
| BRCA1 | NM_007294 | 17 | 41196311 | 41277500 | TRUE |
| BRCA2 | NM_000059 | 13 | 32889616 | 32973809 | TRUE |
| TP53 | NM_000546 | 17 | 7571719 | 7590868 | TRUE |
| STK11 | NM_000455 | 19 | 1205797 | 1228434 | TRUE |
| MLH1 | NM_000249 | 3 | 37034840 | 37092337 | TRUE |
cat("File saved as download_output.txt in Supplementary_Files")
## File saved as download_output.txt in Supplementary_Files
import.file.1000g <- function(gene) {
#for tracking:
sprintf("%s [%s/%s]", gene, grep(gene, ACMG.panel), length(ACMG.panel)) %>% print(quote = F)
name <- paste("1000G",paste(gene,"genotypes.vcf", sep = "_"), sep = "/")
output <- read.table(paste(getwd(),name,sep="/"), stringsAsFactors = FALSE)
#Add header
names(output)[1:length(header)] <- header
#Remove all single alt indels
output <- output[nchar(output$REF)==1,] #deletions
alt_num <- sapply(strsplit(output$ALT,","),length) #number of alts
acceptable_nchar <- 2*alt_num-1 #adds in the length from commas, if each alt is 1 nt.
output <- output[nchar(output$ALT)==acceptable_nchar,] #insertions
alt_num <- sapply(strsplit(output$ALT,","),length) #recalculate
paired = which(alt_num!=1) #all with ,
#Add AF Column
af <- strsplit(output$INFO,";") %>% sapply("[", 2) %>%
strsplit("AF=") %>% sapply("[", 2) %>% strsplit(",") %>% sapply(as.numeric)
output <- cbind(GENE = gene, "AF_1000G"=I(af), output) #Places it at the front of output
front_cols <- 1:(grep("HG00096",colnames(output))-1)
if (length(paired)!=0) {
#Limit max vector length by sapply(strsplit(output$ALT,","),length)
sapply(paired, function(rownum) { #For every row
sapply(as.character(1:alt_num[rownum]), function(num) {
grepl(paste(num,"|",sep = ""), output[rownum,-front_cols], fixed=T) +
grepl(paste("|",num,sep = ""), output[rownum,-front_cols], fixed=T)
}) %>% t -> temp
split(temp, rep(1:ncol(temp), each = nrow(temp))) %>% setNames(NULL)
#Separate into list of vectors (1 entry for counting each ALT)
}) %>% t -> insert
insert <- cbind(output[paired,front_cols],insert)
colnames(insert) <- colnames(output)
insert <- insert %>% #adds front_col info
mutate(ALT = strsplit(ALT,",")) %>% #Splits ALTS
unnest() %>% #Unnests everything
select(GENE, AF_1000G, CHROM, POS, ID, REF, ALT, everything()) #Reorders everything
output <- output[-paired,] #Removes paired
}
output <- cbind(output[,front_cols],
apply(output[,-front_cols], 2, function(y) {
grepl("1|", y, fixed=T) +
grepl("|1", y, fixed=T)
}) ) #convert to logical
if (length(paired)!=0)
output <- rbind(output, insert) #joins the two
output$AF_1000G <- as.numeric(output$AF_1000G)
unite(output, VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
#Make VAR_ID, arrange by VAR_ID
}
if (skip_processing) {
#saveRDS(ACMG.1000g, file = "ACMG_Penetrance/ACMG_1000G.rds")
ACMG.1000g <- readRDS(file = "ACMG_Penetrance/ACMG_1000G.rds")
} else {
# Import 1000G data for all ACMG
ACMG.1000g <- NULL
header <- c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", as.character(map$sample))
for (gene in ACMG.panel) {
#print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
ACMG.1000g <- rbind(ACMG.1000g,import.file.1000g(gene))
}
#Display and remove duplicates
#ACMG.1000g[duplicated(ACMG.1000g$VAR_ID),1:8]
ACMG.1000g <- ACMG.1000g[!duplicated(ACMG.1000g$VAR_ID),]
}
#write.csv(ACMG.1000g, file = "ACMG.1000G.csv", row.names = F, quote = F)
#save(ACMG.1000g, file = "ACMG.1000G")
cat(sprintf("Processed 1000 Genomes VCFs: %s x %s (selected rows/columns):", nrow(ACMG.1000g), ncol(ACMG.1000g)))
## Processed 1000 Genomes VCFs: 141467 x 2516 (selected rows/columns):
ACMG.1000g[1:5,1:18] %>% select(-INFO, -QUAL, -FILTER, -FORMAT) %>%
format(scientific = F) %>% pander
| GENE | AF_1000G | VAR_ID | CHROM | POS | ID | REF | ALT |
|---|---|---|---|---|---|---|---|
| BRCA1 | 0.004193290 | 17_41196363_C_T | 17 | 41196363 | rs8176320 | C | T |
| BRCA1 | 0.008386580 | 17_41196368_C_T | 17 | 41196368 | rs184237074 | C | T |
| BRCA1 | 0.000998403 | 17_41196372_T_C | 17 | 41196372 | rs189382442 | T | C |
| BRCA1 | 0.342252000 | 17_41196408_G_A | 17 | 41196408 | rs12516 | G | A |
| BRCA1 | 0.000399361 | 17_41196409_G_C | 17 | 41196409 | rs548275991 | G | C |
Table continues below
| HG00096 | HG00097 | HG00099 | HG00100 | HG00101 | HG00102 |
|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 |
| 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 1 | 1 | 0 | 2 |
| 0 | 0 | 0 | 0 | 0 | 0 |
if (rm_rs1805124)
ACMG.1000g <- ACMG.1000g[ACMG.1000g$ID!="rs1805124",]
import.file.exac <- function(gene, dataset) {
file_name <- sprintf("%s/%s_%s.csv", dataset, dataset, gene)
output <- read.csv(file_name, stringsAsFactors = FALSE)
output$Number.of.Hemizygotes <- NULL #Inconsistently present column; removal allows row aggregation
# Correcting for some alternate naming conventions
if ("Conseq." %in% colnames(output))
output <- output %>% rename(Consequence = Conseq.)
if ("Count" %in% colnames(output))
output <- output %>% rename(Allele.Count = Count)
# Imputing missing South Asian values for NF2
if (!("Allele.Count.South.Asian" %in% colnames(output))) {
output$Allele.Number.South.Asian <- (2*output$Allele.Number) -
(output %>% select(contains("Allele.Number")) %>% rowSums)
output$Allele.Count.South.Asian <- (2*output$Allele.Count) -
(output %>% select(contains("Allele.Count")) %>% rowSums)
output$Homozygote.Count.South.Asian <- (2*output$Number.of.Homozygotes) -
(output %>% select(contains("Homozygote")) %>% rowSums)
}
output <- cbind(GENE = gene, output[nchar(paste(output$Alternate,output$Reference))==3,]) %>%
select(GENE, AF_EXAC = contains("Freq"), CHROM=Chrom, POS=Position,
ID=RSID, REF=Reference, ALT=Alternate, Annotation = contains("Annot"), everything()) %>%
unite(VAR_ID, CHROM, POS, REF, ALT, sep = "_", remove = F) %>% arrange(VAR_ID)
tags <- list("African","Latino","East.Asian","European","South.Asian")
european <- output %>% select(contains("Finnish"), contains("European"))
if (dataset == "gnomad") {
european <- output %>% select(contains("Finnish"), contains("European"), contains("Jewish"))
output <- output %>% select(GENE, AF_GNOMAD = AF_EXAC, everything())
}
output$Allele.Count.European <- european %>% select(contains("Allele.Count")) %>% rowSums
output$Allele.Number.European <- european %>% select(contains("Allele.Number")) %>% rowSums
exac_af <- output[,sprintf("Allele.Count.%s", tags)] / output[,sprintf("Allele.Number.%s", tags)]
colnames(exac_af) <- sprintf("AF_%s_%s", toupper(dataset), c("AFR","AMR","EAS","EUR","SAS"))
output <- cbind(output, exac_af) %>%
select(GENE, contains(toupper(dataset)), everything())
if (rm_rs1805124)
output <- output[output$ID!="rs1805124",]
return(output)
}
# Import ExAC data for all ACMG
ACMG.exac <- NULL
ACMG.gnomad <- NULL
for (gene in ACMG.panel) {
#print(sprintf("[%d/%d] %s",which(gene==ACMG.panel),length(ACMG.panel),gene))
ACMG.exac <- rbind(ACMG.exac,import.file.exac(gene, "exac"))
ACMG.gnomad <- rbind(ACMG.gnomad,import.file.exac(gene, "gnomad"))
}
#Display and remove duplicates
#ACMG.exac[duplicated(ACMG.exac$VAR_ID),1:8]
#ACMG.gnomad[duplicated(ACMG.gnomad$VAR_ID),1:8]
ACMG.exac <- ACMG.exac[!duplicated(ACMG.exac$VAR_ID),]
ACMG.gnomad <- ACMG.gnomad[!duplicated(ACMG.gnomad$VAR_ID),]
#write.csv(ACMG.exac, file = "ACMG.ExAC.csv", row.names = F, quote = F)
#write.csv(ACMG.gnomad, file = "ACMG.gnomAD.csv", row.names = F, quote = F)
cat(sprintf("Processed gnomAD VCFs: %s x %s (selected rows/columns):", nrow(ACMG.gnomad), ncol(ACMG.gnomad)))
## Processed gnomAD VCFs: 96742 x 48 (selected rows/columns):
ACMG.gnomad[sample(nrow(ACMG.gnomad) %>% sort,5),c(1,2,8)] %>% format(scientific = F) %>% pander
| GENE | AF_GNOMAD | VAR_ID | |
|---|---|---|---|
| 51192 | TNNT2 | 0.00000416 | 1_201336100_A_T |
| 2952 | BRCA2 | 0.00000823 | 13_32912812_A_G |
| 114414 | ATP7B | 0.00000803 | 13_52532398_T_G |
| 19971 | VHL | 0.00003310 | 3_10188348_T_G |
| 20285 | MEN1 | 0.00009930 | 11_64571649_C_T |
cat(sprintf("Processed ExAC VCFs: %s x %s (selected rows/columns):", nrow(ACMG.exac), ncol(ACMG.exac)))
## Processed ExAC VCFs: 59883 x 45 (selected rows/columns):
ACMG.exac[sample(nrow(ACMG.exac),5) %>% sort,c(1,2,8)] %>% format(scientific = F) %>% pander
| GENE | AF_EXAC | VAR_ID | |
|---|---|---|---|
| 123 | BRCA1 | 0.000016580 | 17_41203072_G_A |
| 32607 | TNNT2 | 0.000008364 | 1_201342228_C_T |
| 34038 | MYL3 | 0.000065900 | 3_46904707_T_C |
| 323101 | LDLR | 0.000008256 | 19_11218197_C_A |
| 307010 | RYR1 | 0.000008529 | 19_39005659_C_G |
This will allow us to assign genotypes from the 1000 Genomes VCF to ancestral groups.
From: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel
#read the map and delete the file
map <- read.table(file = "Supplementary_Files/phase3map.txt", stringsAsFactors = F, header = T) %>% as.data.frame
#display
cat("Phase 3 Populations Map Table: 2504 x 4 (selected rows)")
## Phase 3 Populations Map Table: 2504 x 4 (selected rows)
map[sample(nrow(map),6),] %>% arrange(super_pop) %>% remove_rownames %>% pander
| sample | pop | super_pop | gender |
|---|---|---|---|
| HG01989 | ACB | AFR | female |
| HG02676 | GWD | AFR | female |
| HG02014 | ACB | AFR | male |
| HG01069 | PUR | AMR | male |
| HG01344 | CLM | AMR | male |
| HG03792 | ITU | SAS | male |
#Make list of populations and superpopulations for later plotting
pop.table <- map[!duplicated(map$pop),] %>%
select(contains("pop")) %>% arrange(super_pop, pop)
super <- pop.table$super_pop %>% setNames(pop.table$pop)
super.levels <- unique(pop.table$super_pop)
pop.levels <- unique(pop.table$pop)
#Plot distribution of ancestral backgrounds
#Population = factor(as.character(map$pop), levels = pop.levels)
#cat("Population Distribution")
#ggplot(map, aes(map$super_pop, fill = Population)) +
# geom_bar(color = 'black', width = 0.5) +
# ylab ("No. of Individuals") + xlab ("Superpopulation") +
# ggtitle("1000 Genomes - Samples by Population")
#rm(Population)
if (!("AF_1000G_AFR" %in% colnames(ACMG.1000g))) {
front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
sapply(super.levels, function(superpop){
keep <- map$super_pop == superpop
(ACMG.1000g[,length(front_cols)+which(keep)] %>% rowSums)/(2*sum(keep))
}) -> pop_af
colnames(pop_af) <- sprintf("AF_1000G_%s",super.levels)
ACMG.1000g <- data.frame(ACMG.1000g, pop_af) %>%
select(GENE, AF_1000G, VAR_ID, CHROM, POS, ID, REF, ALT,
AF_1000G_AFR, AF_1000G_AMR, AF_1000G_EAS, AF_1000G_EUR, AF_1000G_SAS, everything())
rm(pop_af)
}
merge_clinvar_1000g <- function() {
inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.1000g$VAR_ID)
clinvar_merged <- clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID)
ACMG_merged <- ACMG.1000g[ACMG.1000g$VAR_ID %in% inter,] %>% arrange(VAR_ID)
front_cols <- 1:(grep("HG00096",colnames(ACMG.1000g))-1)
cbind(ACMG_merged[,c("GENE",sprintf("AF_1000G%s", c("",paste0("_",super.levels))))],
clinvar_merged,ACMG_merged[,-front_cols])
}
merged_1000g <- merge_clinvar_1000g()
inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.exac$VAR_ID)
merged_exac <- cbind(clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID),
ACMG.exac %>% select(VAR_ID, contains("AF_"), GENE) %>%
filter(VAR_ID %in% inter) %>% arrange(VAR_ID) %>% select(-VAR_ID)
) %>% select(VAR_ID, GENE, AF_EXAC, contains("AF_"), everything())
inter <- intersect(clinvar$VAR_ID[clinvar$INTERP], ACMG.gnomad$VAR_ID)
merged_gnomad <- cbind(clinvar[(clinvar$VAR_ID %in% inter),] %>% arrange(VAR_ID),
ACMG.gnomad %>% select(VAR_ID, contains("AF_"), GENE) %>%
filter(VAR_ID %in% inter) %>% arrange(VAR_ID) %>% select(-VAR_ID)
) %>% select(VAR_ID, GENE, AF_GNOMAD, contains("AF_"), everything())
#count up all pathogenic ClinVar in ACMG regions
is.acmg <- function(row) {
genes <- which(row$CHROM == download_output$chrom)
sapply(genes, function(gene) {
between(row$POS, download_output[gene,]$start, download_output[gene,]$end)
}) %>% any
}
cat("Breakdown of ClinVar Variants")
## Breakdown of ClinVar Variants
data.frame(Subset_ClinVar = c("Total ClinVar","LP/P","ACMG LP/P",
"ACMG LP/P in gnomAD", "ACMG LP/P in ExAC","ACMG LP/P in 1000 Genomes"),
Number_of_Variants = c(nrow(clinvar),
sum(clinvar$INTERP),
sum(apply(clinvar[clinvar$INTERP,], 1, is.acmg)),
nrow(merged_gnomad),
nrow(merged_exac),
nrow(merged_1000g))) %>% pander
| Subset_ClinVar | Number_of_Variants |
|---|---|
| Total ClinVar | 204730 |
| LP/P | 34152 |
| ACMG LP/P | 6781 |
| ACMG LP/P in gnomAD | 1179 |
| ACMG LP/P in ExAC | 845 |
| ACMG LP/P in 1000 Genomes | 135 |
breakdown <- function(dataset) {
dataset_name <- ifelse(dataset == "1000G", dataset, tolower(dataset))
ACMG.data <- parse(text=sprintf("ACMG.%s", tolower(dataset))) %>% eval
sprintf("Breakdown of ACMG-%s Variants", dataset) %>% cat
data.frame(Subset_gnomAD = sprintf(c("ACMG in %s", "ClinVar-ACMG in %s", "LP/P-ACMG in %s"), dataset),
Number_of_Variants = c(nrow(ACMG.data),
intersect(clinvar$VAR_ID, ACMG.data$VAR_ID) %>% length,
parse(text=sprintf("merged_%s", tolower(dataset))) %>% eval %>% nrow
)
) %>% pander
}
breakdown("gnomAD")
## Breakdown of ACMG-gnomAD Variants
| Subset_gnomAD | Number_of_Variants |
|---|---|
| ACMG in gnomAD | 96742 |
| ClinVar-ACMG in gnomAD | 13897 |
| LP/P-ACMG in gnomAD | 1179 |
breakdown("ExAC")
## Breakdown of ACMG-ExAC Variants
| Subset_gnomAD | Number_of_Variants |
|---|---|
| ACMG in ExAC | 59883 |
| ClinVar-ACMG in ExAC | 10778 |
| LP/P-ACMG in ExAC | 845 |
breakdown("1000G")
## Breakdown of ACMG-1000G Variants
| Subset_gnomAD | Number_of_Variants |
|---|---|
| ACMG in 1000G | 141466 |
| ClinVar-ACMG in 1000G | 6012 |
| LP/P-ACMG in 1000G | 135 |
clinvar_query.txt contains all results matched by the search query: “(APC[GENE] OR MYH11[GENE]… OR WT1[GENE]) AND (clinsig_pathogenic[prop] OR clinsig_likely_pathogenic[prop])” from the ClinVar website. The exact query is saved in /Supplementary_Files/query_input.txt
This presents another way of collecting data from ClinVar.
Intermediate step: convert hg38 locations to hg19 using the Batch Coordinate Conversion tool (liftOver) from UCSC Genome Browser Utilities.
clinvar_query <- read.table(file = "Supplementary_Files/clinvar_result_2016_12_04.txt", sep = "\t", header = F, skip = 1, stringsAsFactors = F)
colnames(clinvar_query) <- c("Name", "Gene(s)", "Condition(s)", "Frequency", "Clinical significance (Last reviewed)","Review status", "Chromosome","Location","Assembly","VariationID","AlleleID(s)")
clinvar_count <- NULL
clinvar_count <- clinvar_count %>% c(nrow(clinvar_query)) #1
clinvar_query <- clinvar_query[grepl(".>.",clinvar_query$Name),]
clinvar_count <- clinvar_count %>% c(nrow(clinvar_query)) #2
clinvar_query <- clinvar_query[!grepl(" - ", clinvar_query$Location) &
!grepl("|",clinvar_query$Chromosome, fixed = T) &
!(clinvar_query$Location == "") &
!grepl("Conflicting", clinvar_query$`Clinical significance (Last reviewed)`),]
clinvar_count <- clinvar_count %>% c(nrow(clinvar_query)) #3
clinvar_query <- clinvar_query %>%
mutate(Name = sub(".*(.>.).*","\\1", clinvar_query$Name)) %>%
mutate(Location = as.integer(Location)) %>%
separate(Name, into = c("Alternate","Reference"), sep = ">")
#liftOver from http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/liftOverMerge
#input data from http://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/
#paste(paste0("chr",clinvar_query$Chromosome), clinvar_query$Location, clinvar_query$Location+1) %>%
# write.table(file = "Supplementary_Files/cvquery_hg38_2016_12_04.bed", quote = F, row.names = F, col.names = F)
#system("chmod +x ../Tools/liftOver")
#system("../Tools/liftOver Supplementary_Files/cvquery_hg38_2016_12_04.bed ../Tools/hg38ToHg19.over.chain.gz Supplementary_Files/cvquery_hg19_2016_12_04.bed err.log")
cvquery_hg19 <- read.table(file = "Supplementary_Files/cvquery_hg19_2016_12_04.bed", sep = "\t", header = F, stringsAsFactors = F)
clinvar_query <- clinvar_query %>%
mutate(Location = cvquery_hg19$V2) %>%
unite(col = "VAR_ID", Chromosome, Location, Reference, Alternate, sep = "_", remove = F)
clinvar_count <- clinvar_count %>% c(sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID), #4
sum(clinvar_query$VAR_ID %in% clinvar$VAR_ID[clinvar$INTERP]), #5
sum(clinvar_query$VAR_ID %in% merged_gnomad$VAR_ID), #6
sum(clinvar_query$VAR_ID %in% merged_exac$VAR_ID), #7
sum(clinvar_query$VAR_ID %in% merged_1000g$VAR_ID)) #8
clinvar_query[(clinvar_query$VAR_ID %in% merged_exac$VAR_ID) & (clinvar_query$VAR_ID %in% merged_1000g$VAR_ID),] -> temp
cat(sprintf("ClinVar Query Results Table (substitutions only): %s x %s (selected rows/columns)",
nrow(clinvar_query), ncol(clinvar_query)))
## ClinVar Query Results Table (substitutions only): 6445 x 13 (selected rows/columns)
clinvar_query %>% filter(clinvar_query$VAR_ID %in% merged_gnomad$VAR_ID) %>%
slice(1:5) %>% select(c(1,4:8)) %>%
mutate(`Condition(s)` = `Condition(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1)) %>%
mutate(Frequency = Frequency %>% strsplit(", ") %>% sapply("[",1) ) %>%
mutate(`Gene(s)` = `Gene(s)` %>% strsplit("|", fixed = T) %>% sapply("[",1) ) %>% pander
| VAR_ID | Gene(s) | Condition(s) | Frequency |
|---|---|---|---|
| 1_17350520_G_C | SDHB | Paragangliomas 4 | NA |
| 1_45798631_T_A | MUTYH | Hereditary cancer-predisposing syndrome | NA |
| 1_201334766_A_T | TNNT2 | Familial hypertrophic cardiomyopathy 2 | NA |
| 3_38646328_G_C | SCN5A | Atrial fibrillation, familial, 10 | NA |
| 3_38647447_G_C | SCN5A | Atrial fibrillation, familial, 10 | NA |
Table continues below
| Clinical significance (Last reviewed) | Review status |
|---|---|
| Pathogenic/Likely pathogenic, not provided(Last reviewed: Feb 2, 2015) | criteria provided, single submitter |
| Pathogenic(Last reviewed: Sep 17, 2015) | criteria provided, single submitter |
| Pathogenic/Likely pathogenic(Last reviewed: Feb 27, 2016) | criteria provided, multiple submitters, no conflicts |
| Pathogenic(Last reviewed: Apr 15, 2008) | no assertion criteria provided |
| Pathogenic(Last reviewed: Apr 15, 2008) | no assertion criteria provided |
cat("Breakdown of ClinVar Query Results Table: ")
## Breakdown of ClinVar Query Results Table:
data.frame(Subset = c("Initial Count","Filter Substitutions (N>N')","Filter Coupling/Bad-Locations","In ClinVar VCF","In LP/P-ClinVar","In LP/P-ACMG & gnomAD","In LP/P-ACMG & ExAC","In LP/P-ACMG & 1000G"), Number_of_Variants = clinvar_count) %>% pander
| Subset | Number_of_Variants |
|---|---|
| Initial Count | 14097 |
| Filter Substitutions (N>N’) | 7039 |
| Filter Coupling/Bad-Locations | 6445 |
| In ClinVar VCF | 494 |
| In LP/P-ClinVar | 493 |
| In LP/P-ACMG & gnomAD | 45 |
| In LP/P-ACMG & ExAC | 33 |
| In LP/P-ACMG & 1000G | 1 |
cat("Note the large reduction after merging the online query results with the VCF.")
## Note the large reduction after merging the online query results with the VCF.
### For plotting population level data:
prettyprint <- function(values, sd, title, xlabel, ylabel, ylimit) {
if (missing(sd)) sd <- TRUE
if (missing(title)) title <- NULL
if (missing(xlabel)) xlabel <- "Population"
if (missing(ylabel)) ylabel <- NULL
if (missing(ylimit)) ylimit <- NULL
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
plot.pop <- ggplot(values, aes(x=Population, y=Mean, fill = Superpopulation)) +
geom_bar(stat = "identity") + ggtitle(title) + xlab(xlabel) + ylab(ylabel) +
theme_minimal() + theme(axis.text.x = element_text(angle = -45, hjust = 0.4))
if (sd) {
if (min(values$Mean - values$SD)<0)
plot.pop <- plot.pop + geom_errorbar(aes(
ymin = pmax(0,values$Mean - values$SD),
ymax = values$Mean + values$SD, width = 0.5))
else
plot.pop <- plot.pop + geom_errorbar(aes(ymin = values$Mean - values$SD,
ymax = values$Mean + values$SD, width = 0.5))
} else {values$SD = 0}
if (length(ylimit)==2)
plot.pop <- plot.pop + ylim(ylimit[1],ylimit[2])
else
plot.pop <- plot.pop + ylim(0, 1.1*max(values$Mean + values$SD))
plot.pop
}
plot_af_distrib <- function() {
af_distrib <- data.frame(Index = 2:max(nrow(merged_1000g),nrow(merged_exac))-1,
AF_1000G = sort(merged_1000g$AF_1000G[merged_1000g$AF_1000G != max(merged_1000g$AF_1000G)],
decreasing = T) %>% c(rep(NA,nrow(merged_exac)-nrow(merged_1000g))),
AF_EXAC = sort(merged_exac$AF_EXAC[merged_exac$AF_EXAC != max(merged_exac$AF_EXAC)],
decreasing = T)) %>%
gather(Dataset, Allele_Frequency, AF_1000G, AF_EXAC) %>%
filter(!is.na(Allele_Frequency))
ggplot(aes(x = Allele_Frequency, color = Dataset), data = af_distrib) +
geom_density() + facet_grid(Dataset ~ .) + xlab("Allele Frequency") + ylab("Density") +
theme(legend.position="none")
}
#plot_af_distrib()
data.frame(
Spectrum_1000_Genomes = table(round(ACMG.1000g$AF_1000G*5008))[as.character(1:20)] %>% as.vector,
Spectrum_ExAC = table(ACMG.exac$Allele.Count)[as.character(1:20)] %>% as.vector,
Spectrum_gnomAD = table(ACMG.gnomad$Allele.Count)[as.character(1:20)] %>% as.vector
) %>% gather(Dataset, Frequency) %>%
ggplot(aes(x = c(1:20,1:20,1:20), y = Frequency)) +
geom_bar(aes(fill = Dataset), position = "dodge", stat = "identity") +
xlab("Allele Counts") + ggtitle("Allele Count Spectrum (1-20)") +
theme(legend.position="bottom")
#hist(round(ACMG.1000g$AF_1000G*5008) %>% table -> temp, breaks = 300000, xlim = c(0,20),
# main = "Allele Frequency Spectrum", xlab = "Allele Count")
#hist(table(ACMG.exac$Allele.Count)+0.2, breaks = 120000, xlim = c(0,20), ylim = c(0,1100),
# add = T, col = 'red')
#Test of Poissonness
x = table(merged_1000g$AF_1000G)
k = 1:length(x)-1
poissondata = data.frame(k=k, poissonness = as.vector(log(x)+lfactorial(k)))
The distribution of allele frequencies is approximately Poisson, with “Poissonness plot” correlation = 0.99. The Poissonness plot (Hoaglin 1980) is defined as the plot of \(log(x_k) + log(k!)\) vs. \(k\), as shown below:
ggplot(aes(x=k,y=poissonness), data = poissondata) + geom_point() + ylab("log(x_k) + log(k!)") + ggtitle("Poissonness Plot")
var_plot_1000g <- function(pathogenic, frac) {
if (pathogenic){
KP <- sapply(merged_1000g$CLNSIG, function(x) 5 %in% x)
KP_only <- c("RET","PRKAG2","MYH7","TNNI3","TPM1","MYL3","CACNA1S",
"DSP","MYL2","APOB","PCSK9","RYR1","RYR2","SDHAF2","ACTC1")
ACMG.data <- merged_1000g[!((merged_1000g$GENE %in% KP_only) & (!KP)),]
} else {
ACMG.data <- ACMG.1000g
}
front_cols <- 1:(grep("HG00096",colnames(ACMG.data))-1)
recessive <- ACMG.data$GENE %in% c("MUTYH","ATP7B")
sapply(pop.levels, function(pop) {
keep <- c(front_cols,map$pop)==pop
if (frac) {
temp <- (colSums(ACMG.data[!recessive,keep])>0) +
(colSums(ACMG.data[ recessive,keep])>1)
} else {
#Counts the number of non-reference sites in a gene
temp <- colSums(ACMG.data[,keep]>0)
}
c(mean(temp), sd(temp))
}) %>% t %>% tbl_df -> values #Number of non-reference sites across the different populations
colnames(values) <- c("Mean","SD")
values$Population <- factor(pop.levels, levels = pop.levels)
values$Superpopulation <- factor(super[pop.levels], levels = super.levels)
title <- sprintf("ACMG-59%s: %s in 1000 Genomes",
ifelse(pathogenic, " Pathogenic",""), ifelse(frac,"Fraction","Mean"))
prettyprint(values, title = title, sd = F, ylimit = NULL,
xlabel = "Population",
ylabel = ifelse(frac, "Fraction with at least 1 non-reference site",
"Mean No. of Non-Reference Sites")
)
}
var_plot_exac <- function(dataset, pathogenic, frac) {
ACMG.data <- parse(text=paste0(ifelse(pathogenic, "merged_", "ACMG."), tolower(dataset))) %>% eval
exac_prob <- 1-(1-ACMG.data[,sprintf("AF_%s_%s", toupper(dataset), super.levels)])^2
if (frac) {
exac_values <- data.frame(1-apply(1-exac_prob, 2, prod, na.rm = T), super.levels)
} else {
exac_values <- data.frame(exac_prob %>% colSums(na.rm = T), super.levels)
}
colnames(exac_values) = c("values","Superpopulation")
ggplot(exac_values, aes(x = Superpopulation, y=values, fill = Superpopulation)) +
geom_bar(stat = "identity") + theme_minimal() +
ggtitle(sprintf("ACMG-59%s: %s in %s",
ifelse(pathogenic, " Pathogenic",""), ifelse(frac,"Fraction","Mean"), dataset)) +
xlab("Population") + ylab("Mean No. of Non-Reference Sites") +
ylim(0,1.1*max(exac_values$values))
}
Each individual has \(n\) non-reference sites, which can be found by counting. The mean number is computed for each population.
Ex: the genotype of 3 variants in 3 people looks like this:
example <- ACMG.1000g[3142:3144,sprintf("HG00%d",366:368)]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
| HG00366 | HG00367 | HG00368 | |
|---|---|---|---|
| Variant 1 | 2 | 1 | 1 |
| Variant 2 | 2 | 1 | 1 |
| Variant 3 | 1 | 0 | 0 |
Count the number of non-reference sites per individual:
colSums(example>0) %>% pander
| HG00366 | HG00367 | HG00368 |
|---|---|---|
| 3 | 2 | 2 |
cat(sprintf("Mean = %s", mean(colSums(example>0)) %>% signif(3)))
## Mean = 2.33
var_plot_1000g(pathogenic = F, frac = F)
Note: the error bars denote standard deviation, not standard error.
The mean number of non-reference sites is \(E(V)\), where \(V = \sum_{i=1}^n v_i\) is the number of non-reference sites at all variant positions \(v_1\) through \(v_n\).
At each variant site, the probability of having at least 1 non-reference allele is \(P(v_i) = P(v_{i,a} \cup v_{i,b})\), where \(a\) and \(b\) indicate the 1st and 2nd allele at each site.
If the two alleles are independent, \(P(v_{i,a} \cup v_{i,b})\) = \(1-(1-P(v_{i,a}))(1-P(v_{i,b})) = 1-(1-AF(v_i))^2\)
If all variants are independent, \(E(V) = \sum_{i=1}^n 1-(1-AF(v_i))^2\) for any set of allele frequencies.
Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:
example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.1 | 0.2 | 0 | 0 | 0.3 |
| Variant 2 | 0.2 | 0 | 0.3 | 0 | 0.1 |
The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\). Note that this is approximately \(2*AF\) when \(AF\) is small:
as.data.frame(1-(1-example)^2) %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.19 | 0.36 | 0 | 0 | 0.51 |
| Variant 2 | 0.36 | 0 | 0.51 | 0 | 0.19 |
By linearity of expectation, the expected (mean) number of non-reference sites is \(\sum E(V_i) = \sum (columns)\).
colSums(1-(1-example)^2) %>% pander
| AFR | AMR | EAS | EUR | SAS |
|---|---|---|---|---|
| 0.55 | 0.36 | 0.51 | 0 | 0.7 |
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
var_plot_exac("gnomAD", pathogenic = F, frac = F)
#var_plot_exac("ExAC", pathogenic = F, frac = F)
This is the same procedure as above, but performed only on the subset of variants that are pathogenic.
var_plot_1000g(pathogenic = T, frac = F)
var_plot_exac("gnomAD", pathogenic = T, frac = F)
#var_plot_exac("ExAC", pathogenic = T, frac = F)
We can count up the fraction of individuals with 1+ non-reference site(s) in each population. This is the fraction of individuals who would receive a positive genetic test result in at least 1 of the ACMG-59 genes.
Ex: the genotype of 3 variants in 3 people looks like this:
example <- ACMG.1000g[3142:3144,sprintf("HG00%d",366:368)]
rownames(example) <- c("Variant 1", "Variant 2","Variant 3")
example %>% pander
| HG00366 | HG00367 | HG00368 | |
|---|---|---|---|
| Variant 1 | 2 | 1 | 1 |
| Variant 2 | 2 | 1 | 1 |
| Variant 3 | 1 | 0 | 0 |
Count each individual as having a non-reference site (1) or having only reference sites (0):
(1*(colSums(example>0)>0)) %>% pander
| HG00366 | HG00367 | HG00368 |
|---|---|---|
| 1 | 1 | 1 |
cat(sprintf("Mean = %s", mean(1*(colSums(example>0)>0)) %>% signif(3)))
## Mean = 1
var_plot_1000g(pathogenic = T, frac = T)
The probability of having at least 1 non-reference site is \(P(X)\), where \(X\) indicates a non-reference site at any variant position \(v_1\) through \(v_n\).
Recall that \(P(v_i) = P(v_{i,a} \cup v_{i,b}) = 1-(1-AF(v))^2\) when alleles are independent.
If all alleles are independent, \(P(X) = P(\bigcup_{i=1}^n v_i) = 1-\prod_{i=1}^n (1-AF(v_i))^2\)
Ex: the allele frequencies of 3 variants across the 5 superpopulations looks like this:
example <- rbind(c(0.1,0.2,0,0,0.3),c(0.2,0,0.3,0,0.1)) %>% as.data.frame
rownames(example) <- c("Variant 1", "Variant 2")
colnames(example) <- super.levels
example %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.1 | 0.2 | 0 | 0 | 0.3 |
| Variant 2 | 0.2 | 0 | 0.3 | 0 | 0.1 |
The probability of having at least 1 non-reference site at each variant - (0|1) (1|0) or (1|1) is given by \(1-(1-AF)^2\).
Note that this is approximately \(2*AF\) when \(AF\) is small:
as.data.frame(1-(1-example)^2) %>% pander
| AFR | AMR | EAS | EUR | SAS | |
|---|---|---|---|---|---|
| Variant 1 | 0.19 | 0.36 | 0 | 0 | 0.51 |
| Variant 2 | 0.36 | 0 | 0.51 | 0 | 0.19 |
The expected (mean) number of non-reference sites is given by \(1-\prod (1-AF)^2\).
apply(example, 2, function(x) 1-prod((1-x)^2)) %>% pander
| AFR | AMR | EAS | EUR | SAS |
|---|---|---|---|---|
| 0.4816 | 0.36 | 0.51 | 0 | 0.6031 |
### ExAC
#Each element is the probability that at least 1 of the 2 alleles are non-reference.
var_plot_exac("gnomAD", pathogenic = T, frac = T)
#var_plot_exac("ExAC", pathogenic = T, frac = T)
### 1000 Genomes
merged_1000g %>% select(contains("AF_1000G_")) -> af_1000g_by_ancestry
rownames(af_1000g_by_ancestry) <- merged_1000g$VAR_ID
colnames(af_1000g_by_ancestry) <- super.levels
ord <- order(apply(af_1000g_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- row.names(af_1000g_by_ancestry)[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id),
af_1000g_by_ancestry[ord,]) %>% gather(Ancestry, Subdivided_Allele_Frequencies, -Var_ID)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
geom_bar(stat='identity', color = 'black', width = 0.7) +
ggtitle("High Frequency Variants in 1000 Genomes") + coord_flip()
### gnomAD
af_gnomad_by_ancestry <- merged_gnomad[,sprintf("AF_GNOMAD_%s",super.levels)]
colnames(af_gnomad_by_ancestry) <- super.levels
ord <- order(apply(af_gnomad_by_ancestry,1,sum), decreasing = T)[1:8]
ranked_id <- merged_gnomad$VAR_ID[ord]
ranked_var <- data.frame(Var_ID = factor(ranked_id, levels = ranked_id),
af_gnomad_by_ancestry[ord,]) %>%
gather(Ancestry, Subdivided_Allele_Frequencies, 2:6)
ggplot(ranked_var, aes(x = Var_ID, y = Subdivided_Allele_Frequencies, fill = Ancestry)) +
geom_bar(stat='identity', color = 'black', width = 0.7) +
ggtitle("High Frequency Variants in gnomAD") + coord_flip()
Let \(V_x\) be the event that an individual has 1 or more variant related to disease \(x\),
and \(D_x\) be the event that the individual is later diagnosed with disease \(x\).
In this case, we can define the following probabilities:
1. Prevalence = \(P(D_x)\)
2. Allele Frequency = \(P(V_x)\)
3. Allelic Heterogeneity = \(P(V_x|D_x)\)
4. Penetrance = \(P(D_x|V_x)\)
By Bayes’ Rule, the penetrance of a variant related to disease \(x\) may be defined as: \[P(D_x|V_x) = \frac{P(D_x)*P(V_x|D_x)}{P(V_x)} = \frac{Prevalence*Allelic.Heterogeneity}{Allele.Frequency}\]
To compute penetrance estimates for each of the diseases related to the ACMG-59 genes, we will use the prevalence data we collected into Literature_Prevalence_Estimates.csv, allele frequency data from 1000 Genomes and ExAC, and a broad range of values for allelic heterogeneity.
Data Collection: 1. Similar disease subtypes were grouped together (e.g., the 8 different types of familial hypertrophic cardiomyopathy), resulting in 30 disease categories across 59 genes.
2. The search query “[disease name] prevalence” was used to find articles using Google Scholar.
3. Prevalence estimates were recorded along with URL, journal, region, publication year, sample size, first author, population subset (if applicable), date accessed, and potential issues. Preference was given to studies with PubMed IDs, more citations, and larger sample sizes.
Prevalence was recorded as reported: either a point estimate or a range. Values of varying quality were collected across all diseases.
ACMG_Lit_Full <- read.csv(file = "Supplementary_Files/Literature_Prevalence_Estimates.csv",
header = TRUE, stringsAsFactors = F, na.strings = "\\N")
ACMG_Lit <- ACMG_Lit_Full %>% filter(Evaluate)
abbrev <- ACMG_Lit$Short_Name
abbrev_all <- ACMG_Lit_Full$Short_Name
inv.prev <- ACMG_Lit$Inverse_Prevalence %>% as.numeric %>% setNames(abbrev)
expand_pipes <- function(item) { strsplit(item, "|", fixed = T) %>% unlist }
gene.list <- expand_pipes(ACMG_Lit_Full$Gene)
report <- setNames(ACMG_Lit_Full$Variants_to_report %>% expand_pipes(), gene.list)[!duplicated(gene.list)]
inheritance <- setNames(ACMG_Lit_Full$Inheritance %>% expand_pipes(), gene.list)[!duplicated(gene.list)]
cat(sprintf("Table of Literature-Based Estimates %s x %s (selected rows/columns):", nrow(ACMG_Lit), ncol(ACMG_Lit)))
## Table of Literature-Based Estimates 22 x 18 (selected rows/columns):
ACMG_Lit[c(4,5,8,14),] %>% remove_rownames %>%
select(Gene, Phenotype, Abbreviation, Inverse_Prevalence, Allelic_Heterogeneity) %>% pander
| Gene | Phenotype | Abbreviation | Inverse_Prevalence |
|---|---|---|---|
| MEN1 | Multiple endocrine neoplasia type 1 | MEN1 | 30000 |
| RET | Multiple endocrine neoplasia type 2; FMTC | MEN2 | 35000 |
| TSC1|TSC2 | Tuberous sclerosis complex | TSC | 5800 |
| GLA | Fabry’s Disease | Fabry | 1600 |
Table continues below
| Allelic_Heterogeneity |
|---|
| 0.9 |
| 0.98 |
| 0.9 |
| 1 |
acmg_ah <- ACMG_Lit$Allelic_Heterogeneity %>% as.numeric
data.frame(Disease = abbrev,
Inverse_Prevalence = inv.prev %>% setNames(NULL)) %>%
ggplot(aes(x = factor(Disease, levels = Disease[order(Inverse_Prevalence)]), y = Inverse_Prevalence)) +
geom_point(stat = 'identity') + coord_flip() + xlab("Disease") +
scale_y_continuous(trans='log10', breaks = 10^(0:6),
labels = sapply(0:6, function(x) paste0("1",paste0(rep("0",x), collapse = "")))) +
theme(axis.text.y=element_text(size=7)) +
ggtitle("Distribution of Inverse Prevalences (log-scale)") + ylab("Inverse Prevalence")
We define AF(disease) as the probability of having at least 1 variant associated with the disease.
The frequencies across the relevant variants can be aggregated in two ways:
(1) By direct counting, from genotype data in 1000 Genomes.
(2) AF(disease) = \(1-\prod_{variant}(1-AF_{variant})\), from population data in ExAC (assumes independence).
sample_size <- list()
sample_size$Cohort_1000G <- table(map$super_pop) %>%
as.numeric %>% setNames(super.levels) %>% c("1000G"=2504)
sample_size$Cohort_EXAC <- c("AFR"=5203,"AMR"=5789,"EAS"=4327,
"EUR"=3307+33370,"SAS"=8256,"EXAC"=60252)
sample_size$Cohort_GNOMAD <- c("AFR"=12942,"AMR"=18237,"EAS"=9472,
"EUR"=5081+13046+63416,"SAS"=15450,"GNOMAD"=137644)
aggregateCalc <- function(input, superpop, item, dataset, loc) {
find = sprintf("AF_%s",toupper(dataset))
if (superpop!=dataset)
find = paste(find, superpop, sep = "_")
# Aggregation by calculation + ind assumption
freq <- input[loc,find] %>% unlist %>% as.numeric #vector of all allele frequencies
#if (sum(freq, na.rm = T)==0) { # prob after 0 obs using Laplace's rule of succession.
# freq <- 1/(2+2*sample_size[[paste0("Cohort_",dataset)]][superpop])
#} else {
if (inheritance[item] == "AR")
freq <- freq^2 #AR: prob of having a non-reference site at BOTH alleles
if (inheritance[item] %in% c("AD","SD"))
freq <- 1-(1-freq)^2 #AD/SD: prob of having a non-reference site at EITHER allele
if (inheritance[item] == "XL")
freq <- freq #XL: prob of having a non-reference site at ONE allele (male)
freq <- 1-prod(1-freq[!is.na(freq)]) # prob of having a non-reference site in 1 chrom
#}
return(freq)
}
aggregateCount <- function(input, superpop, item, dataset, loc) {
# Aggregation by counting
front_cols <- 1:(grep("HG00096",colnames(input))-1)
find <- (1:ncol(input))[-front_cols]
if (superpop != dataset)
find <- length(front_cols)+which(map$super_pop==superpop)
if (inheritance[item] %in% c("AD","SD"))
reduced_input <- input[loc, find]
if (inheritance[item] == "AR")
reduced_input <- input[loc, find]-1 #Looking for 2s
if (inheritance[item] == "XL") {
male <- length(front_cols)+which(map$gender=="male")
reduced_input <- input[loc,intersect(find,male)]
}
freq <- mean(apply(reduced_input, 2, function(col) any(col>=1)))
#if (freq==0)
# freq <- 1/(2+2*sample_size[["Cohort_1000G"]][superpop])
return(freq)
}
getAlleleFreq <- function(input, ind, dataset) {
search_list <- ACMG_Lit_Full$Gene
search_in <- "GENE" #CLNDSDBID
if (!("CLNSIG" %in% colnames(input)))
input$CLNSIG <- rep(5, nrow(input))
KP_only <- grepl(5,input$CLNSIG)
data.frame(search_list,
sapply(search_list, function(item.vec) {
item.vec.split <- expand_pipes(item.vec)
sapply(item.vec.split, function(item) {
loc <- grepl(item,input[,search_in], ignore.case = T)
if (!grepl("EP",report[[item]])) #If we're only taking KP
loc <- loc & KP_only #Take all relevant genes
sapply(c(dataset,super.levels), function(superpop) {
if (ind) {
return(aggregateCalc(input, superpop, item, dataset, loc))
} else {
return(aggregateCount(input, superpop, item, dataset, loc))
}
})
}) %>% apply(1, function(row) 1-prod(1-row)) %>%
setNames(sprintf("AF_%s%s",toupper(dataset), c("",paste0("_",super.levels))))
}) %>% t %>% tbl_df
)
}
# Other methods MIM and Tags
# Do NOT use MIM if CLNDSDBID is missing (older VCFs)
#input = ACMG.1000g[select_rows,]; ind = F; method = "Gene"; dataset = "1000G"
freq_1000g.count.gene <- getAlleleFreq(input = merged_1000g, ind = F, dataset = "1000G")
freq_1000g.calc.gene <- getAlleleFreq(input = merged_1000g, ind = T, dataset = "1000G")
freq_gnomad.calc.gene <- getAlleleFreq(input = merged_gnomad, ind = T, dataset = "GNOMAD")
freq_exac.calc.gene <- getAlleleFreq(input = merged_exac, ind = T, dataset = "EXAC")
allele.freq <- data.frame(
COUNT_1000G = freq_1000g.count.gene$AF_1000G,
CALC_1000G = freq_1000g.calc.gene$AF_1000G,
CALC_GNOMAD = freq_gnomad.calc.gene$AF_GNOMAD,
CALC_EXAC = freq_exac.calc.gene$AF_EXAC
)
row.names(allele.freq) <- abbrev_all
save(freq_1000g.calc.gene, freq_1000g.count.gene,
freq_exac.calc.gene, freq_gnomad.calc.gene,
file = "Shiny_App/disease_level_AF.RData")
af_files <- c("freq_1000g.count.gene","freq_1000g.calc.gene","freq_gnomad.calc.gene","freq_exac.calc.gene")
lapply(af_files, function(file) {
write.csv(eval(parse(text=file)),
file = sprintf("Supplementary_Files/Processed_Files/%s.csv",file))
}) %>% invisible
write.csv(allele.freq, file = sprintf("Supplementary_Files/Processed_Files/freq.all.gene.csv",file))
#cor(allele.freq) %>% as.data.frame %>% pander
ggplot(aes(x = CALC_1000G, y = CALC_GNOMAD), data = allele.freq) +
geom_point(stat = "identity", col = 'red') +
geom_text_repel(aes(label = abbrev_all), size = 3) +
scale_x_continuous(limits = c(10^-6, max(allele.freq[,"CALC_GNOMAD"]))*5,
trans='log10', breaks = 10^-(0:6)) +
scale_y_continuous(limits = c(10^-6, max(allele.freq[,"CALC_GNOMAD"]))*5,
trans='log10', breaks = 10^-(0:6)) +
xlab("Allele Frequency (1000 Genomes)") + ylab("Allele Frequency (gnomAD)") +
geom_abline(slope = 1, intercept = 0) + ggtitle("Scatterplot: gnomAD v. 1000 Genomes")
Ratio_1000G (red, top) computes AF(calculation in 1000 Genomes) / AF(counting in 1000 Genomes).
Ratio_gnomAD (blue, bottom) computes AF(calculation in gnomAD) / AF(calculation in 1000 Genomes).
#Ranked by max of the of the 2 ratios in each disease
ratio_diff <- function() {
ratio <- data.frame(Means = allele.freq$COUNT_1000G,
Ratio_1000G = (pmax(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)/
pmin(allele.freq$CALC_1000G, allele.freq$COUNT_1000G)),
Ratio_gnomAD = (pmax(allele.freq$CALC_GNOMAD, allele.freq$CALC_1000G)/
pmin(allele.freq$CALC_GNOMAD, allele.freq$CALC_1000G))) %>%
mutate(Disease = factor(abbrev_all,
levels = abbrev_all[order(pmax(Ratio_gnomAD,Ratio_1000G))])) %>%
gather(Dataset, Ratio, Ratio_1000G, Ratio_gnomAD) %>%
filter(is.finite(Ratio))
ggplot(aes(x=Disease, y=Ratio, color = Dataset), data = ratio) + coord_flip() +
geom_point(stat = 'identity') + facet_wrap("Dataset", ncol = 1) +
ggtitle("Ratios of Allele Frequencies from Different Methods") +
scale_y_continuous(breaks = seq(0,100,1)) + theme(legend.position="none")
}
ratio_diff()
Sampling 1000 variants from all variants in 1000 Genomes to test deviations from independence assumptions.
Repeat for 1000 trials and plot the distribution of disease-level allele frequencies (1000 points per disease).
Only variants with allele frequency > 0.01 are evaluated. Since we look at 17 variants per disease, the maximum is approximately \(1-(1-0.01)^{34} \approx 0.29\)
plot_ind_test <- function() {
ind_test_data <- readRDS("Supplementary_Files/ind_test.rds")
#set.seed(123)
#do.call("rbind",lapply(1:1000, function(x) {
# print(x)
# lapply(ACMG.panel, function(gene){
# sites <- which(ACMG.1000g$GENE==gene)
# small_sites <- sites[ACMG.1000g[sites,"AF_1000G"] < 0.01]
# sample(small_sites,17)
# }) %>% unlist -> select_rows
# data.frame(DISEASE = ACMG_Lit_Full$Abbreviation,
# COUNT = getAlleleFreq(ACMG.1000g[select_rows,],
# ind = F, dataset = "1000G")$AF_1000G,
# CALC = getAlleleFreq(ACMG.1000g[select_rows,],
# ind = T, dataset = "1000G")$AF_1000G)
#})) -> ind_test_data
#saveRDS(ind_test_data, "Supplementary_Files/ind_test.rds")
#plot(ind_test_data %>% ggplot(aes(x=CALC-COUNT)) + geom_histogram(bins = 100) +
# xlab("Calculation - Counting") + ylab("Histogram Counts"))
#plot(ind_test_data %>% filter(COUNT>0) %>% ggplot(aes(x=pmax.int(CALC/COUNT, COUNT/CALC))) +
# geom_histogram(bins = 100) + xlab("Calculation/Counting") + ylab("Histogram Counts") + xlim(0,4))
#+ facet_wrap("DISEASE", ncol = 3))
sapply(levels(ind_test_data$DISEASE), function(d) {
ind_test_data %>% filter(DISEASE == d) %>%
mutate(DIFF = CALC-COUNT) %>% mutate(RATIO = pmax(CALC/COUNT,COUNT/CALC)) %>%
select(RATIO) %>% unlist
}) %>% data.frame %>% gather(Disease, Data) %>% filter(is.finite(Data)) -> plot_data
plot(plot_data %>% filter(!(Disease %in% c("Wilson","HCRC.AR","OTC"))) %>%
ggplot(aes(x = Disease, y = Data)) + geom_boxplot() + coord_flip() +
ylab("Allele Frequency Difference: Calculation/Counting") +
ggtitle("Differences in AF Methods: by Disease"))
plot(plot_data %>% filter(Disease %in% c("Wilson","HCRC.AR","OTC")) %>%
ggplot(aes(x = Data)) + geom_histogram(bins = 100) +
facet_wrap("Disease", ncol = 1, scales = "free") +
xlab("Allele Frequency Difference: Calculation/Counting") +
ggtitle("Differences in AF Methods: by Disease (Outliers)"))
plot(ggplot(aes(x = COUNT, y = CALC), data = ind_test_data[runif(nrow(ind_test_data)) < 0.1,]) +
geom_point() + ggtitle("Testing Independence with Random Sampling") +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) +
scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.1)) +
xlab("Allele Frequency (From Counting)") + ylab("Allele Frequency (From Calculation)") +
geom_abline(slope = 1, intercept = 0, color = 'blue') +
geom_text(aes(label = replace(DISEASE,which(abs(COUNT - CALC) < 0.3), NA)),
check_overlap = F, hjust = 1, na.rm = T, size = 2.5)
)
cat("30 diseases x 1000 points = 30,000 points.")
cat("This plot has been downsampled 10x and contains 3,000 points.")
ind_test_data %>% filter(COUNT>0)
}
plot_ind_out <- plot_ind_test()
## 30 diseases x 1000 points = 30,000 points.This plot has been downsampled 10x and contains 3,000 points.
sprintf("Pearson correlation: %s",
signif(cor(plot_ind_out$COUNT,plot_ind_out$CALC),3)) %>% cat
## Pearson correlation: 0.995
sprintf("Mean ratio (Calculation/Counting): %s",
signif(mean(plot_ind_out$CALC/plot_ind_out$COUNT, na.rm = T),3)) %>% cat
## Mean ratio (Calculation/Counting): 0.971
rm(plot_ind_out)
\newpage
The left end of the boxplot indicates P(V|D) = 0.01,
the bold line in the middle indicates P(V|D) = point value,
the right end of the boxplot indicates P(V|D) = 1.
if (nrow(allele.freq)==nrow(ACMG_Lit_Full))
allele.freq <- allele.freq[ACMG_Lit_Full$Evaluate,]
get_penetrance <- function(ah_low, ah_high, dataset) {
# Map of disease name to disease tags
if (dataset == "1000 Genomes")
named.freqs <- allele.freq$COUNT_1000G %>% setNames(abbrev)
if (dataset == "gnomAD")
named.freqs <- allele.freq$CALC_GNOMAD %>% setNames(abbrev)
if (dataset == "ExAC")
named.freqs <- allele.freq$CALC_EXAC %>% setNames(abbrev)
named.prev <- 1/inv.prev %>% setNames(abbrev)
# Repeats allow for correct quartile calculations
#point estimate set to arithmetic mean
allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high) %>%
rep(nrow(ACMG_Lit)) %>% matrix(nrow = nrow(ACMG_Lit), byrow = T)
allelic.het[,3] <- acmg_ah
#Functions to transform data points with disease_af = 0
set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
# Matrix of penetrance values for allelic het range, capped at 1
prev_freq <- named.prev/named.freqs %>% rep(5) %>% matrix(nrow = nrow(ACMG_Lit))
penetrance <- apply(allelic.het * prev_freq, 1, set_to_na) %>% as.data.frame
# Take row 5 to sort by max, take colSums to sort by overall
# Break ties by rows 3 and 1 (mean and low)
ord <- order(penetrance[5,], penetrance[3,], penetrance[1,], decreasing = T)
# replicate each element n times to create labels
penetrance_data <- data.frame("Penetrance" = penetrance %>% unlist, "Disease" =
factor(sapply(abbrev, function(x) rep(x,5)) %>% as.vector,
levels = abbrev[ord]) )
colormap <- rep('black', length(abbrev))
num_na <- mean(is.na(penetrance_data$Penetrance))*length(colormap)
if (num_na != 0)
colormap[1:num_na] <- 'gray60'
colormap <- rev(colormap)
plot(
ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) +
geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() +
ggtitle(sprintf("%s: Barplot of Min/Point/Max Penetrance", dataset)) +
theme(axis.text.y = element_text(color = colormap)) + ylim(0,1)
#annotate("text", x = which(colormap == 'red'), y = 0,
# label = "No Allele Frequency Data", hjust = 0, size = 3)
)
penetrance_data
}
pen_gnomad <- get_penetrance(ah_low = 0.01, ah_high = 1, dataset = "gnomAD")
pen_1000g <- get_penetrance(ah_low = 0.01, ah_high = 1, dataset = "1000 Genomes")
#ah_low = 0.01; ah_high = 1; dataset = "gnomAD"
Note: Some diseases have mean theoretical penetrance = 1 because the assumed allelic heterogeneity is greater than is possible, given the observed prevalence and allele frequencies.
\newpageancestry_penetrance <- function(ah_low, ah_high, dataset, range, position) {
pos <- replace(c(F,F,F,F,F), ifelse(position == "Max", 5, 3), T)
sapply(c("Total",super.levels), function(superpop){
# Map of disease name to disease tags
find <- paste0("AF_", toupper(dataset))
named.freqs <- eval(parse(text=sprintf("freq_%s.calc.gene", tolower(dataset))))[ACMG_Lit_Full$Evaluate,]
if (superpop != "Total")
find <- paste(find, superpop, sep = "_")
named.freqs <- named.freqs[,find] %>% unlist %>% setNames(abbrev)
allelic.het <- c(ah_low, ah_low, mean(c(ah_low, ah_high)) %>% signif(3), ah_high, ah_high) %>%
rep(nrow(ACMG_Lit)) %>% matrix(nrow = nrow(ACMG_Lit), byrow = T)
allelic.het[,3] <- acmg_ah
#make_prev_ranges(range) -> prev_final
# Matrix of penetrance values for allelic het range, capped at 1
set_to_na <- function(row) { replace(row, is.infinite(row),NA) %>% pmin(1)}
apply(allelic.het / ACMG_Lit$Inverse_Prevalence / named.freqs, 1, set_to_na) %>% unlist
}) %>% as.data.frame -> penetrance_init
### Quantifying ancestral variation
#temp <- penetrance_init[pos,]
#sort(apply(temp, 1, max, na.rm = T) - apply(temp, 1, min, na.rm = T))
#sort(apply(temp, 1, max, na.rm = T) / apply(temp, 1, min, na.rm = T))
# Take column 5 to sort by max, take rowSums to sort by overall
# Break ties by rows 3 and 1 (mean and low)
mat_pen <- matrix(penetrance_init$Total, ncol = 5, byrow = T)
ord <- order(mat_pen[,5], mat_pen[,3], mat_pen[,1], decreasing = T)
penetrance_data <- data.frame(penetrance_init,
"Disease" = factor(sapply(abbrev,
function(x) rep(x,5)) %>% as.vector,
levels = abbrev[ord]) )
#m <- list(l = 170, r = 220, b = -50, t = 50, pad = 5)
# heatmap <- plot_ly(
# x = factor(c(super.levels,"Total"), levels = c("Total",super.levels)),
# y = factor(sapply(abbrev, function(x) rep(x,5)) %>% as.vector, levels = abbrev[ord]),
# z = penetrance_init[pos,][ord,] %>% as.matrix, type = "heatmap", height = 700
# ) %>% layout(autosize = T, margin = m); heatmap
# Star/Radar Plot
temp <- penetrance_data[pos,] %>% select(-Disease, -Total)
rownames(temp) <- abbrev
order(rowSums(temp, na.rm = T), decreasing = T)[1:10] -> wanted
col <- 3
stars(temp[wanted,], scale = F, full = F, len = 1, nrow = ceiling(10/col), ncol = col,
flip.labels = F, key.loc = c(2*col,2),
main = sprintf("Radar Plot: %s Penetrance by Ancestry (%s)", position, dataset),
draw.segments = T, col.segments = c('red','yellow','green','blue','purple'))
print("These are the top 10 diseases by summed allele frequencies. NULL values are not plotted.", quote = F)
print("Each radius is proportional to the penetrance of the disease in the given population.", quote = F)
# Barplot
penetrance_data <- gather(penetrance_data, Subset, Penetrance, -Disease)
plot(ggplot(aes(x=Disease, y=Penetrance), data = penetrance_data) +
geom_boxplot(position = 'identity', coef = 0, na.rm = T) + coord_flip() +
facet_wrap(~Subset, ncol = 2) + ggtitle(sprintf("Barplot: Penetrance by Ancestry (%s)", dataset)) +
theme(axis.text.y=element_text(size=6), axis.text.x = element_text(angle = -20, hjust = 0.4))
)
# Heatmap
plot(ggplot(aes(x=Disease, y = Subset), data = penetrance_data[pos,]) + coord_flip() +
geom_tile(aes(fill = Penetrance), color = 'white') + xlab("Disease") + ylab("Ancestry") +
scale_fill_gradient(low='white',high = 'darkblue', na.value = "grey50",
breaks=c(0,0.25,0.5,0.75,1), labels=c("0","0.25","0.50","0.75","1.00"), limits =c(0,1)) +
ggtitle(sprintf("Heatmap: %s Penetrance by Ancestry (%s)", position, dataset)) +
theme_minimal() + theme(axis.ticks = element_blank()) +
annotate("segment", y=c(0.5,5.5,6.5), yend=c(0.5,5.5,6.5),
x=0.5, xend = length(abbrev)+0.5) +
annotate("segment", y=0.5, yend=6.5,
x=c(0.5,length(abbrev)+0.5),
xend = c(0.5,length(abbrev)+0.5))
)
cat("Dark gray boxes are NA: no associated variants discovered in that ancestral population.")
}
ancestry_penetrance(ah_low = 0.01, ah_high = 1, dataset = "gnomAD", range = 5, position = "Max")
## [1] These are the top 10 diseases by summed allele frequencies. NULL values are not plotted.
## [1] Each radius is proportional to the penetrance of the disease in the given population.
## Dark gray boxes are NA: no associated variants discovered in that ancestral population.
#ancestry_penetrance(ah_low = 0.01, ah_high = 1, dataset = "1000G", range = 5, position = "Max")
#ah_low = 0.01; ah_high = 1; dataset = "gnomAD"; range = 5; position = "Max"
## Empirical CDFs for All Penetrance Plots
pen_1000g$Penetrance[c(F,F,F,F,T)] %>% ecdf %>%
plot(ylab = "Fraction with max theoretical penetrance < P", xlab = "P",
main = "CDF: Penetrance ~ P(V|D)", ylim = c(0,1.02), pch = 19)
pen_gnomad$Penetrance[c(F,F,F,F,T)] %>% ecdf %>% plot(col = 'red', add = T, pch = 1)
legend("bottomright", legend = c("gnomAD","1000G"), col = c("red","black"), pch = c(1,19))
download.file(url = "http://www.omim.org/static/omim/data/mim2gene.txt", destfile = "Supplementary_Files/mim2gene.txt", method = "internal")
mim2gene <- read.table(file = "Supplementary_Files/mim2gene.txt", header = FALSE, sep = "\t", stringsAsFactors = F)
colnames(mim2gene) <- c("MIM_Number","MIM_Entry_Type","Entrez_Gene_ID (NCBI)","Approved_Gene_Symbol_(HGNC)","Ensembl_Gene_ID_(Ensembl)")
mim2gene <- mim2gene[mim2gene$`Approved_Gene_Symbol_(HGNC)`!="",]
ACMG.ENSG <- mim2gene[mim2gene$`Approved_Gene_Symbol_(HGNC)` %in% ACMG.table$Gene_Name,
"Ensembl_Gene_ID_(Ensembl)"] %>% strsplit(",") %>% sapply("[",1)
mybpc3.page <- scrape(url="http://gnomad.broadinstitute.org/gene/ENSG00000134571")[[1]]
mybpc3.page <- scrape(url="http://exac.broadinstitute.org/gene/ENSG00000134571")[[1]]
mybpc3.table <- readHTMLTable(mybpc3.page, stringsAsFactors = F, header = T)
colnames(mybpc3.table) <- NULL
## Test Statistics for Ancestral Differences
#F-statistic/T-statistic: probability that the different groups are sampled from distributions with the same mean. <br />
#These plots are from 4(a) - 1000 Genomes Fraction with 1+ Non-Reference Site, but can be replicated for plots 2(ab) and 3(ab) as well.
#Calculating test statistics (F-values)
Fcalc <- function(values, pop) {
if (missing(pop)) {
groups <- super[pop.levels]
} else {
groups <- ifelse(super[pop.levels]==pop,pop,"Other")
}
data <- data.frame(y = values, group = factor(groups))
color_map <- c("red","gold3","springgreen3","deepskyblue","violet","white") %>%
setNames(c("AFR","AMR","EAS","EUR","SAS","Other"))
out <- lm(y ~ group, data) %>% anova
plot(y ~ group, data, xlab = NULL, ylab = NULL,
col = color_map[sort(unique(groups))],
main = paste("F-statistic:",out$`Pr(>F)`[1] %>% signif(3)))
out
}
par(mfrow=c(2,3))
F_values <- c(Fcalc(values$Mean)$`Pr(>F)`[1] %>% setNames("Overall"),
sapply(super.levels, function(pop) {
Fcalc(values$Mean, pop)$`Pr(>F)`[1]
}))
par(mfrow=c(1,1), mar=c(5, 4, 4, 2)+0.1)
#F_values